# Replication Script for Carnes et al. 2024
# "The Global Legislators Database"

# Load packages
library(pacman)
p_load(kableExtra, tidyverse, ggthemes, countrycode, haven,
       ggrepel, patchwork, ggpmisc, foreign, dplyr, 
       tidyr, purrr, ggplot2, viridis, ggthemes, stringr, readxl, dataverse
)

# Set theme for ggplots
theme_set(theme_bw(base_size=11)) 


# Load GLD country-level dataset
gld_agg <- read.csv("gld_agg.csv")

temp <- gld_agg %>% 
  summarise(college_avg = mean(college_mean, na.rm = T),
            high_educ_avg = mean(mean_higheduc, na.rm = T),
            mean_avg_school_years = mean(average_years_of_schooling, na.rm = T))
# College avg: 87.4%
# High edu avg: 82%
# Avg school years: 14.9

temp1 <- gld_agg %>% 
  group_by(region) %>% 
  summarise(college_avg = mean(college_mean, na.rm = T),
            high_educ_avg = mean(mean_higheduc, na.rm = T),
            mean_avg_school_years = mean(average_years_of_schooling, na.rm = T)); temp1

temp
temp_aus <- gld_agg %>%  filter(country_name == "Australia")
# College mean: 81.6%
# High edu mean: 79%
# Avg years of schooling: 13.7

#Australia Whole population:
## 51%  have any higher education
## 40% have 4-year college degree or higher



# Figure 1 replication: Shares of women legislators in the GLD and V-Dem
n.countries.share.female <- gld_agg %>% filter(!is.na(v2lgfemleg) & !is.na(share_female)) %>% nrow()

gld_agg %>% 
  mutate(share_female = share_female*100) %>% 
  ggplot(aes(x=v2lgfemleg, y=share_female)) +
  geom_point(alpha=.7) +
  annotate("text", 50, 3, label=paste0("N=", n.countries.share.female)) +
  scale_x_continuous(breaks=seq(0, 60, 10)) +
  scale_y_continuous(breaks=seq(0, 60, 10)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size=2.5, color="grey20") +
  geom_abline(intercept = 0, slope = 1, size = 0.5) +
  labs(x="V-Dem share of women legislators",
       y="GLD share of women legislators", 
       caption="Note: Bahamas, Belize, Fiji, and Kosovo are ommitted because of missing data in V-Dem.") +
  theme(axis.text.x = element_text(size = rel(.9)),
        axis.text.y = element_text(size = rel(.9)),
        axis.title = element_text(size = rel(.9)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15)))


# Figure 2: Legislator age in the GLD and CLD
n.countries.cld.gld.merged <- gld_agg %>% filter(!is.na(age_cld) & !is.na(age_gld)) %>%  nrow()

ggplot(gld_agg, aes(x = age_cld, y = age_gld)) +
  annotate("text", 58, 41, label=paste0("N=", n.countries.cld.gld.merged)) +
  geom_point() +
  # scale_y_continuous(breaks=seq(0, 6, 1), limits=c(0,6)) +
  scale_color_grey(start = 0.8, end = 0.2) +
  scale_fill_grey(start = 0.8, end = 0.2) +
  labs(x = "CLD average age", y = "GLD average age") +
  theme(axis.title.y = element_text(size = 12),
        legend.position = "bottom",
        legend.title = element_blank()) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size=2.5, color="grey20") +
  geom_abline(intercept = 0, slope = 1, size = 0.5) +
  scale_x_continuous(breaks = seq(40, 60, 5), limits = c(40, 60)) +
  scale_y_continuous(breaks = seq(40, 60, 5), limits = c(40, 60))


# NEXT SECTION COMMENTED OUT TO ENSURE RUN 
# TO CREATE FIG 3, CODE PULLS FROM GLD DATASET 
# POSTED ON DATAVERSE


# load GLD individual-level dataset from Harvard Dataverse (used in left-panel of Figure 3)
# gld <- get_dataframe_by_id(
#  filename = "global_legislator_database.csv",
#  dataset = "doi:10.7910/DVN/KGYJFJ", 
#  server = "dataverse.harvard.edu")


# Figure 3: Distributions of legislator traits in the GLD

## Calculate the number of observations for each plot
# n.countries.gld.age <- gld %>% filter(!is.na(dob)) %>% nrow()
# n.countries.gld.mean_educ <- gld_agg %>% filter(!is.na(mean_higheduc)) %>%  nrow()
# n.countries.gld.mean_work <- gld_agg %>% filter(!is.na(share_workingclass)) %>%  nrow()

## Plot three distributions
# gld %>% filter(!dob %in% c("NA")) %>%
  ## age at the time of election
#  mutate(age = year_of_election - as.numeric(substr(dob, 1, 4))) %>%
#  ggplot(aes(x=age)) +
#  geom_density() +
#  annotate("text", 80, 0.03, label=paste0("N=", n.countries.gld.age)) +
#  labs(x = "Age", y = "Density") +
  
#  gld_agg %>% 
  ## share of legislators with higher education
#  ggplot(aes(x=mean_higheduc)) +
#  geom_density() +
#  annotate("text", .1, 3.25, label=paste0("N=", n.countries.gld.mean_educ)) +
#  scale_x_continuous(labels = scales::percent, breaks = seq(0, 1, .25), limits=c(0,1)) +
#  labs(x = "Higher education", y = "") +
  
#  gld_agg %>% 
  ## share of legislators from working class
#  ggplot(aes(x=share_workingclass)) +
#  geom_density() +
#  annotate("text", .9, 16, label=paste0("N=", n.countries.gld.mean_work)) +
#  scale_x_continuous(labels = scales::percent, breaks = seq(0, 1, .25), limits=c(0,1)) +
#  labs(x = "Working class", y = "") +
  
#  plot_layout(ncol = 3) +
#  plot_annotation(caption = 'Note: Age is calculated at the time of election. Higher education includes levels beyond primary and secondary education (Bachelors, \n Masters, PhD, LLB, LLM, JD, MD, and short-cycle tertiary). Data on educational attainments for legislators is unavailable for Côte d\'Ivoire.')


# Figure 4: Numbers of legislators in the GLD and GLP
n.glp.gld.countries <- gld_agg %>% filter(!is.na(n.leg.glp) & !is.na(total_mps_in_data) & !country_name %in% c("Bhutan", "Fiji")) %>% nrow()

ggplot(gld_agg %>%  filter(!country_name %in% c("Bhutan", "Fiji")), aes(x=n.leg.glp, y = total_mps_in_data)) +
  geom_point() +
  annotate("text", 650, 41, label=paste0("N=", n.glp.gld.countries)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size=2.5, color="grey20") +
  geom_abline(intercept = 0, slope = 1, size = 0.5) +
  scale_x_continuous(limits=c(0, 700)) +
  scale_y_continuous(limits=c(0, 700)) +
  labs(x="Number of legislators in GLP",
       y="Number of legislators in GLD") +
  theme(axis.text.x = element_text(size = rel(.9)),
        axis.text.y = element_text(size = rel(.9)),
        axis.title = element_text(size = rel(.9)))



# Figure 5: Reelection rates by years of education, gender, occupational background

# Relection rates: Number of matched Observations 
n.countries.matched = gld_agg %>% filter(!is.na(percent_incumbent)) %>% nrow()
list.countries.not.matched = gld_agg %>% filter(!is.na(percent_incumbent)) %>% select(country_text_id, country_name)

# Scatterplot, shows whether more educated legislators are more likely to be reelected.
n.countries.edu <- gld_agg %>% filter(!is.na(percent_incumbent) & !is.na(average_years_of_schooling)) %>% nrow() 

fig_edu <- 
  gld_agg %>%
  ggplot(aes(y = percent_incumbent, x = average_years_of_schooling)) + 
  annotate("text", 17, 0, label=paste0("N=", n.countries.edu)) +
  geom_point(alpha=.4) + 
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size = 2.5, color="grey20") +
  stat_smooth(method = "lm", 
              col = "black", se = F) +
  #  stat_poly_eq(formula = y ~ x, 
  #               aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
  #               parse = TRUE, size=2.5) +
  scale_y_continuous(limits=c(0, 100), breaks=seq(0, 100, 20)) +
  labs(x = "Average legislator years of schooling", 
       y = "Share of incumbents reelected"#,
       # caption="Note: Data on educational attainments for legislators is unavailable for Côte d'Ivoire."
  ) +
  theme(legend.title = element_text(size = rel(.8)),
        axis.text.x = element_text(size = rel(.8)),
        axis.text.y = element_text(size = rel(.8)),
        axis.title = element_text(size = rel(.8)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15))) 

# Scatterplot, % women legislators reelected x % women legislators reelected. Shows whether women representatives less likely to be reelected. The axes must be identical. 
n.countries.gld.female <- gld_agg %>% filter(!is.na(female_inc) & !is.na(male_inc)) %>% nrow() 

fig.female_mps_reelected <- 
  ggplot(data = gld_agg, aes(x = female_inc, y = male_inc)) + 
  geom_point(alpha = .4) + 
  annotate("text", .95, 0, label=paste0("N=", n.countries.gld.female)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size = 2.5, color = "grey20",
                  max.overlaps = 30) +
  geom_abline(intercept = 0 , slope = 1, size = 1, colour = "grey30") + 
  xlim(0, 1) + 
  ylim(0, 1) +
  labs(x = "Share of women legislators reelected", 
       y = "Share of men legislators reelected") +
  theme(legend.position = "bottom", legend.text = element_text(size = 12),
        legend.title = element_text(size = rel(.8)),
        axis.text.x = element_text(size = rel(.8)),
        axis.text.y = element_text(size = rel(.8)),
        axis.title = element_text(size = rel(.8))) 

# Scatterplot, % working class legislators reelected x % non-working class legislators reelected. Shows whether working class representatives less likely to be reelected. The axes must be identical. 
n.countries.gld.worker <- gld_agg %>% filter(!is.na(workers_inc) & !is.na(non_workers_inc)) %>% nrow() 

fig.working_class_reelected <- 
  ggplot(data = gld_agg, aes(x = workers_inc, y = non_workers_inc)) + 
  geom_point(alpha = .4) + 
  annotate("text", .95, 0, label=paste0("N=", n.countries.gld.worker)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size = 2.5, color="grey20") +
  geom_abline(intercept = 0 , slope = 1, size = 1, colour = "grey30") + 
  xlim(0, 1) + 
  ylim(0, 1) +
  labs(x = "Share of working-class legislators reelected", 
       y = "Share of non-working-class \n legislators reelected",
       caption="Note: Share of working-class legislators is zero for six countries that are dropped from \n the  figure:  Albania, Botswana, Cyprus, Estonia, Guatemala, and Mongolia."
  ) +
  theme(legend.position = "bottom", legend.text = element_text(size = 12),
        legend.title = element_text(size = rel(.8)),
        axis.text.x = element_text(size = rel(.8)),
        axis.text.y = element_text(size = rel(.8)),
        axis.title = element_text(size = rel(.8)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15))) 

# Stacking_figures
fig_edu + fig.female_mps_reelected + fig.working_class_reelected +
  plot_layout(ncol=1)



# Figure 6: Campaign finance and working-class representation
n.countries.finance <- gld_agg %>% filter(!is.na(v2elpubfin_ord) & !is.na(share_workingclass)) %>% nrow()

gld_agg %>% 
  ggplot(aes(x=v2elpubfin_ord, y=share_workingclass)) +
  geom_point(alpha=.7) +
  annotate("text", 4, -.03, label=paste0("N=", n.countries.finance)) +
  scale_x_continuous(breaks=seq(0, 4, 1)) +
  scale_y_continuous(breaks=seq(0, 1, .1)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size=2.5, color="grey20") +
  stat_smooth(method = "lm", 
              col = "black", se = T) +
  #  stat_poly_eq(formula = y ~ x, 
  #               aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
  #               parse = TRUE, size=2.5) +
  labs(x="Campaign finance regulations",
       y="Share of working-class legislators",
       caption="Note: Data on Kosovo, Bahamas and Belize are omitted because of missing data from V-Dem.") +
  theme(axis.text.x = element_text(size = rel(.9)),
        axis.text.y = element_text(size = rel(.9)),
        axis.title = element_text(size = rel(.9)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15)))


# Figure 7: Lawyers in the legislature and rule of law
n.countries.lawyers.rol <- gld_agg %>%filter(!is.na(v2x_rule) & !is.na(share_lawyers)) %>% nrow()

ggplot(gld_agg, aes(x=v2x_rule, y=share_lawyers)) +
  geom_point(alpha=.7) +
  annotate("text", 1, -.03, label=paste0("N=", n.countries.lawyers.rol)) +
  geom_text_repel(aes(label = country_text_id), segment.color = "darkgrey",
                  segment.size = 0.3, size=2.5, color="grey20") +
  stat_smooth(method = "lm", 
              col = "black", se = T) +
  #  stat_poly_eq(formula = y ~ x, 
  #               aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
  #               parse = TRUE, size=2.5) +
  labs(x="Rule of law",
       y="Share of lawyers in legislature",
       caption="Note: Bahamas, Belize, and Kosovo are ommitted because of missing data in V-Dem.") +
  theme(axis.text.x = element_text(size = rel(.9)),
        axis.text.y = element_text(size = rel(.9)),
        axis.title = element_text(size = rel(.9)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15)))



# Appendix A: Supplemental Analysis

# Section 1: List of Countries, Numbers of Legislators, and Percentage of Missing Values in the GLD
countrywide_variables_tab <- gld_agg %>% 
  mutate(dob_perc_NA = round(100*(1-share.nonmissing.dob)),
         gender_perc_NA = round(100*(1-share.nonmissing.gender)),
         party_perc_NA = round(100*(1-share.nonmissing.party)),
         occupation_per_NA = round(100*(1-share.nonmissing.occupation)),
         education_perc_NA = round(100*(1-share.nonmissing.education))) %>% 
  select(country_name, year_of_election, parliamentary_period, legislature_number, total_mps_in_data, 
         dob_perc_NA, gender_perc_NA, party_perc_NA, occupation_per_NA, education_perc_NA)

kable(countrywide_variables_tab,
      "latex",
      align = 'lccrrrrrrr',
      col.names = c("Country", "Election", "Period", "Leg No", "Obs", "DOB", "Gender", "Party", "Occup", "Edu"),
      caption = "List of Countries and Missingness in GLD",
      booktabs = TRUE,
      longtable = TRUE, 
      linesep = "") %>%
  kable_styling(latex_options = c("hold_position", "repeat_header", "scale_down"),
                full_width = FALSE) %>% 
  kableExtra::footnote(general = c("`Election' is the election year for the legislature included. Missing entry under `Leg No' means legislature numbering not used in the country. Column `Obs' reports the number of legislators included in the GLD.  `DOB' is short for date of birth. `Occup' is short for occupation and `Edu' is short for education. Date of birth, gender, party, occupation, and education columns report the percent of missing data on each characteristic for legislators in the country."),
                       threeparttable=TRUE, footnote_as_chunk=TRUE)



# Section 2: Missingness of Legislator Characteristics by Continent and Geographic Region
data_missing_continent <- gld_agg %>% 
  select(country_name, region, sub.region, share.nonmissing.dob, share.nonmissing.gender, share.nonmissing.party, share.nonmissing.occupation, share.nonmissing.education) %>% 
  group_by(region) %>% 
  summarise(dob_perc_NA = round(mean(100*(1-share.nonmissing.dob))),
         gender_perc_NA = round(mean(100*(1-share.nonmissing.gender))),
         party_perc_NA = round(mean(100*(1-share.nonmissing.party))),
         occupation_per_NA = round(mean(100*(1-share.nonmissing.occupation))),
         education_perc_NA = round(mean(100*(1-share.nonmissing.education))))

kable(data_missing_continent,
      "latex",
      col.names = c("Continent", "DOB", "Gender", "Party", "Occupation", "Education"),
      align = 'lrrrrr',
      caption = "Percent of Missing Data by Characteristic and Continent",
      booktabs = TRUE) %>% 
  kable_styling(latex_options = c("hold_position", "repeat_header", "scale_down"),
                position = "center",
                font_size = 12)




data_missing_subregion <- gld_agg %>% 
  group_by(sub.region) %>% 
  summarise(dob_perc_NA = round(mean(100*(1-share.nonmissing.dob))),
            gender_perc_NA = round(mean(100*(1-share.nonmissing.gender))),
            party_perc_NA = round(mean(100*(1-share.nonmissing.party))),
            occupation_per_NA = round(mean(100*(1-share.nonmissing.occupation))),
            education_perc_NA = round(mean(100*(1-share.nonmissing.education))))

kable(data_missing_subregion,
      "latex",
      col.names = c("Region", "DOB", "Gender", "Party", "Occupation", "Education"),
      align = 'lrrrrr',
      caption = "Percent of Missing Data by Characteristic and Region",
      booktabs = TRUE) %>% 
  kable_styling(latex_options = c("hold_position", "repeat_header", "scale_down"),
                position = "center",
                font_size = 12)



#Section 3: Summary Statistics
age_summary <- gld_agg %>% 
  summarise(mean = round(mean(age_gld),2),
            sd = round(sd(age_gld),2),
            min = round(min(age_gld),2),
            max = round(max(age_gld),2)) 

gender_summary <- gld_agg %>% 
  mutate(share_male = 1 - share_female) %>% 
  summarise(mean = round(mean(share_male, na.rm = TRUE)*100,2),
            sd = round(sd(share_male, na.rm = TRUE)*100,2),
            min = round(min(share_male, na.rm = TRUE)*100,2),
            max = round(max(share_male, na.rm = TRUE)*100,2))

party_summary <- gld_agg %>% 
  summarise(mean = round(mean(number_parties),2),
            sd = round(sd(number_parties),2),
            min = round(min(number_parties),0),
            max = round(max(number_parties),0))

edu_summary <- gld_agg %>% 
  summarise(mean = round(mean(college_mean, na.rm = TRUE)*100,2),
            sd = round(sd(college_mean, na.rm = TRUE)*100,2),
            min = round(min(college_mean, na.rm = TRUE)*100,2),
            max = round(max(college_mean, na.rm = TRUE)*100,2))

working_class <- gld_agg %>% 
  summarise(mean = round(mean(share_workingclass, na.rm = TRUE)*100,2),
            sd = round(sd(share_workingclass, na.rm = TRUE)*100,2),
            min = round(min(share_workingclass, na.rm = TRUE)*100,2),
            max = round(max(share_workingclass, na.rm = TRUE)*100,2))

summary_dat <- rbind(age_summary, gender_summary, party_summary, working_class, edu_summary) %>% 
  add_column("Variable" = c("Average Age", "Percent Male", "Number Parties", "Percent Working-Class", "Percent College Degree")) %>% 
  select(Variable, everything())

kable(summary_dat,
      "latex",
      col.names = c("Characteristic", "Mean", "SD", "Min", "Max"),
      align = 'lrrrr',
      caption = "Summary Statistics of Average Characteristics",
      booktabs = TRUE) %>% 
  kable_styling(latex_options = c("hold_position", "repeat_header", "scale_down"),
                position = "center",
                font_size = 12) %>% 
  kableExtra::footnote(general = c("`Working-class' is defined as ISCO-08 codes 400 and above, excluding unemployed, students, and retirees. `College' is defined as legislators possessing at least a four-year college degree."),
                       threeparttable=TRUE, footnote_as_chunk=TRUE)


# Section 4: Share of Working-Class Legislators by Country
ggplot(data = gld_agg, 
         aes(y = share_workingclass, x = reorder(country_name, share_workingclass))) + 
  geom_col() + coord_flip() +
  ylim(0, 1) + 
  labs(y = "Share of working-class legislators",
       x = "Country") +
  theme(legend.position = "bottom", legend.text = element_text(size = 12),
        legend.title = element_text(size = rel(.9)),
        axis.text.x = element_text(size = rel(.8)),
        axis.text.y = element_text(size = rel(.9)),
        axis.title = element_text(size = rel(.9)),
        plot.caption=element_text(size=7.5, hjust=0, margin=margin(t=15))) 




